/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is MSExcelDocument.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*/
package org.terrier.indexing;
import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.InputStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
/** Implements a Document object for a Microsoft Excel spreadsheet.
* Uses HSSF and POIFS subparts of the Jakarta-POI project. This means
* that to use or compile this module, you must have the
* poi-?.?.?-final-*.jar in your classpath. <p>
* A bug in the current stable POI library seems to mean that large
* Excel files cannot be parsed - see the MAXFILESIZE field to control
* the maximum file size that this class will attempt to read.
* @author Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/
public class MSExcelDocument extends FileDocument
{
protected static final Logger logger = Logger.getLogger(MSExcelDocument.class);
/** Size of 1MB in bytes */
protected static final int MEGABYTE = 1048576;
/** Maximum file size that this class will attempt to open. Set to 0
* to ignore. Set by propery <tt>indexing.excel.maxfilesize.mb</tt>,
* default 0.5 */
protected static final long MAXFILESIZE = (long)((float)MEGABYTE *
Float.parseFloat(
ApplicationSetup.getProperty("indexing.excel.maxfilesize.mb", "0.5")
));
/** Construct a new MSExcelDocument Document object
* @param filename the file that is opened for this
* @param docStream the actual stream of the open file */
public MSExcelDocument(String filename, InputStream docStream, Tokeniser tokeniser)
{
super(filename, docStream, tokeniser);
}
/**
* Construct a new MSExcelDocument Document object
* @param docStream
* @param docProperties
* @param tok
*/
public MSExcelDocument(InputStream docStream,
Map<String, String> docProperties, Tokeniser tok) {
super(docStream, docProperties, tok);
}
/**
* Construct a new MSExcelDocument Document object
* @param docReader
* @param docProperties
* @param tok
*/
public MSExcelDocument(Reader docReader, Map<String, String> docProperties,
Tokeniser tok) {
super(docReader, docProperties, tok);
}
/**
* Construct a new MSExcelDocument Document object
* @param filename
* @param docReader
* @param tok
*/
public MSExcelDocument(String filename, Reader docReader, Tokeniser tok) {
super(filename, docReader, tok);
}
/** Get the reader appropriate for this InputStream. This involves
converting the Excel document to a stream of words. On failure
returns null and sets EOD to true, so no terms can be read from
the object.
Uses the property <tt>indexing.excel.maxfilesize.mb</tt> to
determine if the file is too big to open
@param docStream */
@SuppressWarnings("unchecked") //poi version used is for Java 1.4.
protected Reader getReader(InputStream docStream)
{
if (MAXFILESIZE > 0 && (filename == null || new File(filename).length() > MAXFILESIZE))
{
//logger.warn("WARNING: Excel document "+ filename+
// " is too large for POI. Ignoring.");
EOD = true;
return null;
}
try
{
CharArrayWriter writer = new CharArrayWriter();
//opening the file system
POIFSFileSystem fs = new POIFSFileSystem(docStream);
//opening the work book
HSSFWorkbook workbook = new HSSFWorkbook(fs);
for (int i = 0; i < workbook.getNumberOfSheets(); i++ )
{
//got the i-th sheet from the work book
HSSFSheet sheet = workbook.getSheetAt(i);
Iterator rows = sheet.rowIterator();
while( rows.hasNext() ) {
HSSFRow row = (HSSFRow) rows.next();
Iterator cells = row.cellIterator();
while( cells.hasNext() ) {
HSSFCell cell = (HSSFCell) cells.next();
switch ( cell.getCellType() ) {
case HSSFCell.CELL_TYPE_NUMERIC:
String num = Double.toString(cell.getNumericCellValue()).trim();
if(num.length() > 0) {
writer.write(num + " ");
}
break;
case HSSFCell.CELL_TYPE_STRING:
String text = cell.getStringCellValue().trim();
if(text.length() > 0) {
writer.write(text + " ");
}
break;
}
}
}
}
return new CharArrayReader(writer.toCharArray());
}
catch(Exception e )
{
//logger.warn("WARNING: Problem converting excel document"+e);
EOD = true;
return null;
}
}
}